package a;

import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Extracts the text content of a PDF file.
 *
 * @author jzw
 * @date 2022/12/6 13:28
 */
public class PDFTXT {
    /**
     * Output file written by {@link #getTextFromPDF(String)}. Kept identical to the
     * previously hard-coded location so existing callers see the same file.
     */
    private static final String DEFAULT_TXT_PATH = "F:\\workspace\\应知应会.txt";

    /**
     * Extracts the text of a fixed sample PDF and writes it to the default text file.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be read or the text file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String path = "F:\\workspace\\应知应会.pdf";
        getTextFromPDF(path);
    }

    /**
     * Extracts the text of the given PDF, writes it to the default output file,
     * prints it to standard output and returns it.
     *
     * @param pdfFilePath path of the PDF file to read
     * @return the extracted text
     * @throws Exception if the PDF cannot be read or the text file cannot be written
     */
    public static String getTextFromPDF(String pdfFilePath) throws Exception {
        return getTextFromPDF(pdfFilePath, DEFAULT_TXT_PATH);
    }

    /**
     * Extracts the text of the given PDF, writes it as UTF-8 to {@code txtFilePath}
     * (replacing any existing content), prints it to standard output and returns it.
     *
     * @param pdfFilePath path of the PDF file to read
     * @param txtFilePath path of the text file to write
     * @return the extracted text
     * @throws IOException if the PDF cannot be read or parsed, or the text file cannot be written
     */
    public static String getTextFromPDF(String pdfFilePath, String txtFilePath) throws IOException {
        String content;
        // Open read-only: the PDF is never modified, and "rw" would demand write
        // permission and create an empty file when the path does not exist.
        try (RandomAccessRead source = new RandomAccessFile(new File(pdfFilePath), "r")) {
            PDFParser parser = new PDFParser(source);
            parser.parse();
            // try-with-resources guarantees the document is closed even if extraction fails.
            try (PDDocument document = parser.getPDDocument()) {
                content = new PDFTextStripper().getText(document);
            }
        }

        // Explicit UTF-8 so the output does not depend on the platform default charset.
        try (Writer writer = Files.newBufferedWriter(Paths.get(txtFilePath), StandardCharsets.UTF_8)) {
            writer.write(content);
        }

        System.out.println(content);
        return content;
    }
}
